#' ---
#' title: "2022 Survey Recoding"
#' author: "Gento Kato & Fan Lu"
#' date: "August 23, 2023"
#' ---
#' 
#' # Preparation 
#' 

## Clean Up Space
## NOTE: removes every (non-hidden) object from the current environment.
rm(list=ls())

## Set Working Directory (Automatically) ##
## rstudioapi can only report the active document from inside RStudio.
## When the script is run elsewhere (e.g. Rscript or a rendering tool),
## the current working directory is kept instead of stopping with an error.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

## Data Directory: 
## ** This is the location of survey22_original_v7.rds (imported below).
## ** If it is stored in the same folder as this file, set filedir <- "./"
filedir <- "./"

## Import Original Data
## file.path() supplies the separator itself, so filedir works with or
## without a trailing slash.
do <- readRDS(file.path(filedir, "survey22_original_v7.rds"))

## Load the psych package; library() stops with an error when the package
## is not installed, instead of returning FALSE and failing later.
library(psych)

#'
#' # Data Manipulation
#'

# Initiate New Data Set
# The recoded data frame starts with a single column, id, copied from
# do$ResponseId; every recoded variable below is added to it as a new column.
d <- data.frame(id = do$ResponseId)

#'
#' ## DEPENDENT variables of interest
#' 
#' ### The local election suffrage should be granted to foreigners.
#' 
#' * Original: 1=Strongly agree 5=Strongly disagree 6=DK 7=NA

# Print the "labels" attribute stored on the original item
attr(do$rights_foreigner_4, "labels")

# Original Variable
tmp <- do$rights_foreigner_4
table(tmp, useNA="always")

# Recoded Variable: (raw - 1) / 4, so raw codes 1..5 land on 0, 0.25, ..., 1
# NOTE(review): the item description lists raw codes 6=DK and 7=NA; they are
# not set to missing here -- confirm they do not occur in this column.
suff_raw <- ifelse(is.na(tmp), NA, tmp)
d$foreignsuff <- (suff_raw - 1) / 4
table(d$foreignsuff, useNA="always")

# Three-category version, split around the scale midpoint (0.5):
#   code 1 = exactly 0.5, code 2 = below 0.5, code 3 = above 0.5
# NOTE(review): the item description says 1=Strongly agree, which would put
# the "disagree" side above 0.5 -- confirm the direction of the labels below.
suff_side <- ifelse(d$foreignsuff > 0.5, 3, 2)
suff_code <- ifelse(d$foreignsuff == 0.5, 1, suff_side)
d$foreignsuff3 <- factor(suff_code,
                         labels = c("Neither", "Disagree", "Agree"))
table(d$foreignsuff3, useNA="always")

# Same variable with the levels reordered so that "Neither" sits in the middle
d$foreignsuff3x <- factor(d$foreignsuff3,
                          levels = c("Disagree", "Neither", "Agree"))
table(d$foreignsuff3x, useNA="always")

# Foreigner rights
# The four items that are analysed together in this section
rights_items <- c("rights_foreigner_1","rights_foreigner_2",
                  "rights_foreigner_4","rights_foreigner_5")

# Reliability (Cronbach's alpha) of the four items
psych::alpha(do[, rights_items])

# Row average of the four items (tabulated for inspection only)
tmp <- rowMeans(do[, rights_items])
table(tmp, useNA="always")
# d$foreignrights <- (5-ifelse(is.na(tmp), NA, tmp))/3
# table(d$foreignrights, useNA="always")
# d$foreignrights2 <- ifelse(d$foreignrights>0.5,1,0)
# table(d$foreignrights2, useNA="always")

# Factor analysis (psych::fa with its default settings); the first column of
# the score matrix becomes the scale, stored as a plain numeric vector
rights_fa <- psych::fa(do[, rights_items])
d$foreignrights <- as.numeric(rights_fa$scores[, 1])

# Binary version: 1 when the factor score is above 0, otherwise 0
d$foreignrights2 <- ifelse(d$foreignrights > 0, 1, 0)
table(d$foreignrights2, useNA="always")

#'
#' ### Increase in immigrants
#'

## Backyard
attr(do$increase_foreigner_backyard, "labels")
tmp <- do$increase_foreigner_backyard
table(tmp, useNA="always")

# (raw - 1) / 3, so raw codes 1..4 land on 0, 1/3, 2/3, 1
backyard_raw <- ifelse(is.na(tmp), NA, tmp)
d$immigincrease <- (backyard_raw - 1) / 3
table(d$immigincrease, useNA="always")
# Binary version: 1 when above the midpoint (0.5), otherwise 0
d$immigincrease2 <- ifelse(d$immigincrease > 0.5, 1, 0)
table(d$immigincrease2, useNA="always")

## Nation
tmp <- do$increase_foreigner_japan
table(tmp, useNA="always")

# Same rescaling and split as the backyard item
nation_raw <- ifelse(is.na(tmp), NA, tmp)
d$immigincrease_alt <- (nation_raw - 1) / 3
table(d$immigincrease_alt, useNA="always")
d$immigincrease2_alt <- ifelse(d$immigincrease_alt > 0.5, 1, 0)
table(d$immigincrease2_alt, useNA="always")


#'
#' ### Contact With Foreigners
#'

## All Foreigners

# Contact types, in the order of the contact_foreigner_<k>_* column blocks
contact_kinds <- c("coworker", "costudent", "friend", "relative",
                   "intlgroup", "othgroup", "greeting")

# For each contact type k, the indicator is 1 when at least one of the six
# columns contact_foreigner_<k>_1 ... contact_foreigner_<k>_6 equals 1, and
# 0 otherwise. %in% treats a missing answer as "not 1".
for (k in seq_along(contact_kinds)) {
  item_cols <- paste0("contact_foreigner_", k, "_", 1:6)
  n_hits <- Reduce(`+`, lapply(item_cols, function(cn) do[[cn]] %in% 1))
  d[[paste0("foreigncontact_", contact_kinds[k])]] <- ifelse(n_hits > 0, 1, 0)
}

# Count of contact types (0-7) reported by each respondent
contact_cols <- paste0("foreigncontact_", contact_kinds)
d$foreigncontact <- Reduce(`+`, d[contact_cols])
hist(d$foreigncontact)

## Check Internal Consistency alpha = 0.85
psych::alpha(cbind(d$foreigncontact_coworker,d$foreigncontact_costudent, 
                   d$foreigncontact_friend,d$foreigncontact_relative, 
                   d$foreigncontact_intlgroup,d$foreigncontact_othgroup, 
                   d$foreigncontact_greeting))

# Correlation of the contact count with the two dependent variables
cor(d$foreigncontact, d$immigincrease, use="pairwise")
cor(d$foreigncontact, d$foreignsuff, use="pairwise")

#'
#' ## PREDICTORS
#' 
#' ### Education (Ordinal)
#' 
#' * Recoded: 1= "<=SHS", 2="Junior College/Vocational School", 3=">=College" 
#' 

# Original
tmp <- do$edu
table(tmp, useNA="always")

# Recoded: raw code 5 becomes NA, raw code 1 stays 1, and every other raw
# code moves down by one (so raw 1 and 2 share the lowest category)
edu_shifted <- ifelse(tmp == 1, 1, tmp - 1)
edu_code <- ifelse(tmp == 5, NA, edu_shifted)

# Make it a Factor (labels are attached to the sorted observed codes)
edu_labels <- c("<=SHS",
                ">SHS & <College(4yr)",
                ">=College(4yr)")
d$edu <- factor(edu_code, labels = edu_labels)
table(d$edu, useNA="always")

# Education Treatment: 1 for the top category, 0 for the other two
d$edu2 <- ifelse(d$edu == ">=College(4yr)", 1, 0)
table(d$edu2, useNA="always")

#'
#' # Parents' education
#'

# Inspect the raw inputs
table(do$marrykids, useNA="always")
table(do$family_edu_unmarried_1, useNA="always")
table(do$family_edu_unmarried_2, useNA="always")
table(do$family_edu_married_1, useNA="always")
table(do$family_edu_married_2, useNA="always")

# Category labels shared by both parents
edu_low <- "<=SHS"
edu_mid <- ">SHS & <College(4yr)"
edu_top <- ">=College(4yr)"

# Recode one raw education item into the three-category factor:
# raw 5 becomes NA, raw 1 stays 1, every other raw code moves down by one.
recode_parent_edu <- function(raw) {
  code <- ifelse(raw == 5, NA, ifelse(raw == 1, 1, raw - 1))
  factor(code, labels = c(edu_low, edu_mid, edu_top))
}

## Dad ##

# Original: take the "unmarried" item, falling back to the "married" item
# when the former is missing
tmp <- ifelse(is.na(do$family_edu_unmarried_1),
              do$family_edu_married_1,
              do$family_edu_unmarried_1)
table(tmp, useNA="always")
# Recoded, as a factor
d$edu_dad <- recode_parent_edu(tmp)
table(d$edu_dad, useNA="always")
d$edu2_dad <- ifelse(d$edu_dad == edu_top, 1, 0)

## Mom ##

## Original: same fallback rule as for the father
tmp <- ifelse(is.na(do$family_edu_unmarried_2),
              do$family_edu_married_2,
              do$family_edu_unmarried_2)
table(tmp, useNA="always")
# Recoded, as a factor
d$edu_mom <- recode_parent_edu(tmp)
table(d$edu_mom, useNA="always")
d$edu2_mom <- ifelse(d$edu_mom == edu_top, 1, 0)

########################
## Parent (max value) ##
########################

# Higher of the two parents' categories. Because NA | TRUE is TRUE, a
# respondent with one parent missing still gets the top category when the
# other parent is in it; otherwise a missing parent yields NA.
any_top <- d$edu_dad == edu_top | d$edu_mom == edu_top
any_mid <- d$edu_dad == edu_mid | d$edu_mom == edu_mid
parent_label <- ifelse(any_top, edu_top,
                       ifelse(any_mid, edu_mid, edu_low))
d$edu_parent <- factor(parent_label, levels = levels(d$edu_dad))

d$edu2_parent <- ifelse(d$edu_parent == edu_top, 1, 0)

#' 
#' ### Gender
#' 
#' * Original: 1=male 2=female 3=NA
#' * Recoded: 0=male, 1=female
#' 

# Original
tmp <- do$gender
table(tmp, useNA="always")

# Recoded: raw code 3 becomes NA, then 1/2 are shifted to 0/1
gender_code <- ifelse(tmp == 3, NA, tmp)
d$female <- gender_code - 1
table(d$female, useNA="always")

# Complement of female (NA stays NA)
d$male <- 1 - d$female

#'
#' ### Age
#'
#' * Recoded (Categorical): Young (<=30s), Middle Aged (40-50s), Elder (>=60s)

# Original
tmp <- do$age
table(tmp, useNA="always")
# A raw value of 3 is replaced with 30
# NOTE(review): presumably corrects a mistyped answer -- confirm
tmp <- ifelse(tmp==3,30,tmp)
table(tmp)
d$age <- tmp

## Recoded Born Year: 2022 minus age
## NOTE(review): earlier wording said "by Academic Year: April-March", but
## the formula applies no month adjustment -- confirm this is intended.
d$bornyr <- 2022 - d$age

## Academic Year of Entering College
# The survey was fielded in March, so respondents are assumed not to have
# turned 19 yet: the entry year is the year in which they are 18.
d$univyr <- d$bornyr + 18
unique(d$univyr[d$age %in% 18]) # If you are 18, 2022 is the year to enter

# Recoded Categorical: under 40 / 40 to 59 / 60 and over
age_band <- ifelse(d$age >= 60, "Elder (>=60s)",
                   ifelse(d$age >= 40, "Middle Aged (40-50s)",
                          "Young (<=30s)"))
## turn the character labels into a factor, ordered from young to old
d$agecat <- factor(age_band, levels=c("Young (<=30s)",
                                      "Middle Aged (40-50s)",
                                      "Elder (>=60s)"))
table(d$agecat, useNA="always") 

# Recoded Cohort (by d$univyr, the year of turning 18)
## Code 1: Cohort I & II, univyr before 1990 (the two earliest cohorts,
##         split at 1975, are merged into one category)
## Code 2: Cohort III, univyr 1990-1999
## Code 3: Cohort IV,  univyr 2000-2009
## Code 4: Cohort V,   univyr 2010 or later
cohort_code <- ifelse(d$univyr < 1990, 1,
                      ifelse(d$univyr < 2000, 2,
                             ifelse(d$univyr < 2010, 3, 4)))
## A factor variable (labels are attached to the sorted observed codes)
cohort_labels <- c("Cohort I & II (18+ in -1989)",
                   "Cohort III (18+ in 1990-99)",
                   "Cohort IV (18+ in 2000-09)",
                   "Cohort V (18+ in 2010-)")
d$cohort <- factor(cohort_code, labels = cohort_labels)
table(d$cohort, useNA="always")
# fa
# table(tmp, useNA="always")
# 
# d$married <- ifelse(tmp%in%c(1,2,3),1,0)
# table(d$married)

#'
#' ### Income
#'

# Original
# Copy the raw income item into tmp and tabulate it (including NA) before
# recoding; tmp is reused further down when income categories are built.
tmp <- do$income
table(tmp, useNA="always")
# Recoded
## Percentile Conversion Function
## Replaces each value of old.var with the midpoint of the cumulative-share
## interval occupied by its category (shares are computed over non-missing
## values only).
##   old.var     : vector of category codes
##   missing.val : codes in old.var to be treated as missing
## Returns a numeric vector as long as old.var, with NA for missing values;
## elements carry the category code as their name.
convper <- function(old.var,missing.val){
  vals <- old.var
  vals[vals %in% missing.val] <- NA
  freq <- table(vals)
  share <- freq / sum(freq)
  upper <- cumsum(share)                  # cumulative share up to each category
  midpoint <- upper - diff(c(0, upper)) / 2  # step back half of the own share
  midpoint[match(vals, names(midpoint))]
}
# Percentile-midpoint income; raw codes 88 and 99 are treated as missing
d$income <- convper(tmp, c(88,99))
table(d$income, useNA="always")

# Income categories: cut points at 0.33 and 0.67 of the percentile score,
# plus an explicit "Missing" category for codes 88/99 and NA
income_band <- rep(NA_character_, length(d$income))
has_income <- !is.na(d$income)
income_band[has_income & d$income <= 0.33] <- "Low"
income_band[has_income & d$income > 0.33 & d$income <= 0.67] <- "Middle"
income_band[has_income & d$income > 0.67] <- "High"
income_band[tmp %in% c(88,99) | is.na(tmp)] <- "Missing"
d$incomecat <- factor(income_band,
                      levels = c("Low", "Middle", "High", "Missing"))
table(d$incomecat, useNA="always") 

#'
#' # Jobs
#'

## Working Status
tmp <- do$employment
table(tmp, useNA="always")

# Three-category working status:
#   raw 1, 2, 4 -> "Self-Employed/Full-Time/Managerial"
#   raw 3, 5, 6 -> "Student/Housemaker/Part-Time"
#   raw 9 or NA -> missing
#   any other raw code -> "Not Employed"
work_label <- rep("Not Employed", length(tmp))
work_label[tmp %in% c(1, 2, 4)] <- "Self-Employed/Full-Time/Managerial"
work_label[tmp %in% c(3, 5, 6)] <- "Student/Housemaker/Part-Time"
work_label[tmp %in% c(9) | is.na(tmp)] <- NA

# Factor with "Not Employed" as the first (reference) level
d$workstat <- factor(work_label,
                     levels = c("Not Employed",
                                "Student/Housemaker/Part-Time",
                                "Self-Employed/Full-Time/Managerial"))
table(d$workstat, useNA="always")

# d$manager <- ifelse(d$workstat%in%"Managerial",1,0)
# table(d$manager)

# Employment dummy: 0 for "Not Employed", 1 for the other two categories
d$employed <- ifelse(d$workstat == "Not Employed", 0, 1)
table(d$employed)

#'
#' ### Marital Status
#'

## Marital/Kids Status
tmp <- do$marrykids
table(tmp, useNA="always")

# Married dummy: 1 for raw codes 1, 2 or 3, 0 for any other observed code,
# NA when the raw answer is missing
is_married <- ifelse(tmp %in% c(1, 2, 3), 1, 0)
d$married <- ifelse(is.na(tmp), NA, is_married)
table(d$married)

#'
#' # Family Rights
#'

# Inspect the two raw items
table(do$ide_13)
table(do$ide_14)

# Rescale with (value + 3) / 6; a missing answer is first replaced with 0,
# which therefore ends up at 0.5.
# NOTE(review): presumably the items run from -3 to 3, so that 0.5 is the
# scale midpoint -- confirm against the codebook.
d$samesexmar <- (ifelse(is.na(do$ide_13),0,do$ide_13)+3)/6
d$hufubessei <- (ifelse(is.na(do$ide_14),0,do$ide_14)+3)/6

# Reliability and factor analysis of the two rescaled items
psych::alpha(cbind(d$hufubessei,d$samesexmar))
tmp <- psych::fa(cbind(d$hufubessei,d$samesexmar))
hist((tmp$scores), 10)
# Store the scores as a plain numeric vector (first column of the score
# matrix), the same way d$foreignrights is stored, rather than embedding a
# one-column matrix inside the data frame.
d$famrights <- as.numeric(tmp$scores[,1])

#'
#' # Urban Rural
#'

table(do$urban)
# Raw code 6 becomes NA; the remaining codes are reversed and rescaled with
# (5 - raw) / 4, so raw 1 maps to 1 and raw 5 maps to 0
urban_raw <- ifelse(do$urban == 6, NA, do$urban)
d$urban <- (5 - urban_raw) / 4
table(d$urban, useNA="always")

#'
#' # Urban Rural in Childhood
#'

table(do$urban_home)
# Same rescaling as d$urban: raw code 6 becomes NA, otherwise (5 - raw) / 4
home_raw <- ifelse(do$urban_home == 6, NA, do$urban_home)
d$urban_home <- (5 - home_raw) / 4
# Raw code 7 takes over the respondent's current d$urban value
# NOTE(review): presumably 7 means "same place as now" -- confirm
uses_current <- do$urban_home %in% 7
d$urban_home[uses_current] <- d$urban[uses_current]
table(d$urban_home, useNA="always")

## Binary, town/village or not
# 1 when the rescaled value is above 0, 0 when it equals 0
d$urban_home2 <- ifelse(d$urban_home > 0, 1, 0)

#'
#' # Saving Data
#'

#+ eval=FALSE
# Write the recoded data frame d to an RDS file in the current working
# directory (the eval=FALSE chunk option above skips this step on rendering).
saveRDS(d, "data_survey22_v7.rds")
